The goal of this notebook is to explore by different ways the companies' bankruptcy data, analyse the general behavior of data and all form of correlation between the variables and finally to know what variables are useless and have to be ejected.
import numpy as np
import pandas as pd
import json
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from plotly.offline import init_notebook_mode
import ipywidgets
from bokeh.io import output_notebook, show, push_notebook
from bokeh.plotting import figure as bkfigure
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
output_notebook()
init_notebook_mode()
warnings.filterwarnings("ignore")
with open('../data/description.json', 'r') as f:
description = json.loads(f.read())
description['X65'] = 'bankrupt'
dirpath = Path('../data/extension/csv')
lstfile = list(dirpath.glob('*year.csv'))
lstfile
[WindowsPath('../data/extension/csv/1year.csv'),
WindowsPath('../data/extension/csv/2year.csv'),
WindowsPath('../data/extension/csv/3year.csv'),
WindowsPath('../data/extension/csv/4year.csv'),
WindowsPath('../data/extension/csv/5year.csv')]
dataset = {i.stem:pd.read_csv(i) for i in lstfile}
shapes = {x:y.shape for x, y in dataset.items()}
shapes
{'1year': (7026, 65),
'2year': (10172, 65),
'3year': (10502, 65),
'4year': (9791, 65),
'5year': (5909, 65)}
plt.figure(figsize=(12,6))
plt.title('Total number of companies in each dataset', fontweight='bold')
ax = plt.bar(shapes.keys(), [s[0] for s in shapes.values()],
color=['darkred' if s[0] < 10000 else 'darkblue' for s in shapes.values()])
bankruptcy = pd.concat([x.X65.value_counts() for x in dataset.values()], axis=1, ignore_index=True)
bankruptcy.columns = dataset.keys()
bankruptcy.index = ['no bankrupt', 'bankrupt']
bankruptcy
| 1year | 2year | 3year | 4year | 5year | |
|---|---|---|---|---|---|
| no bankrupt | 6755 | 9772 | 10007 | 9276 | 5499 |
| bankrupt | 271 | 400 | 495 | 515 | 410 |
ax = bankruptcy.T.plot.bar(log=True, color=['lime','crimson'], figsize=(12,6), title='Bankrupt companies')
for p in ax.patches:
ax.annotate(p.get_height(), (p.get_x() + p.get_width() / 2., 10), ha = 'center', va = 'center')
In these datasets, the missing values are depicted by a "?"
missing_val = {k:(v == '?').sum().sum() for k,v in dataset.items()}
freq = {k:(v == '?').sum().sum() / np.multiply(*v.shape) for k,v in dataset.items()}
f, ax = plt.subplots(1,2, figsize=(15,6))
f.suptitle('Missing values')
ax[0].barh(list(missing_val.keys()), missing_val.values(), color='purple')
ax[0].set_title('Total number')
ax[1].barh(list(freq.keys()), freq.values(), color='orange')
ax[1].set_title('Frequency')
Text(0.5, 1.0, 'Frequency')
n = 3
pd.options.display.max_rows = 100
data = pd.read_csv(dirpath / 'data.csv')
data.sample(5)
| year | X1 | X2 | X3 | X4 | X5 | X6 | X7 | X8 | X9 | ... | X56 | X57 | X58 | X59 | X60 | X61 | X62 | X63 | X64 | X65 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 38806 | 5year | 0.049595 | 0.34095 | 0.60606 | 3.1313 | 58.498 | 0 | 0.049595 | 1.933 | 1.3639 | ... | 0.31025 | 0.075253 | 0.69923 | 0 | 2.9919 | 7.2586 | 76.101 | 4.7963 | 12.447 | 0 |
| 26531 | 3year | 0.40329 | 0.17028 | 0.34313 | 3.0283 | 37.662 | 0.8706 | 0.40329 | 4.4368 | 1.3541 | ... | 0.26151 | 0.53382 | 0.73849 | 0.001466 | 6.0106 | 8.0205 | 43.585 | 8.3744 | 2.9048 | 0 |
| 953 | 1year | 0.062973 | 0.80734 | 0.027716 | 1.0494 | -35.147 | 0 | 0.062973 | 0.23864 | 1.7158 | ... | 0.1369 | 0.32686 | 0.87162 | 0.47438 | 10.303 | 4.2038 | 119.33 | 3.0587 | 4.1716 | 0 |
| 36135 | 4year | 0.097085 | 0.46396 | 0.14401 | 1.3107 | -17.077 | -0.029973 | 0.12389 | 1.1553 | 3.7971 | ... | 0.035188 | 0.18112 | 0.96755 | 0 | 12.206 | 13.173 | 44.561 | 8.191 | 9.6762 | 0 |
| 38447 | 5year | 0.062268 | 0.013664 | 0.20865 | 18.017 | 142.7 | 0.10974 | 0.071432 | 46.073 | 1.0425 | ... | 0.040792 | 0.09891 | 0.95921 | 0.002228 | ? | 4.1484 | 26.886 | 13.576 | 0.21366 | 0 |
5 rows × 66 columns
missing = (data == '?')
color = ['r' if x > 1000 else ('orange' if x > 100 else 'limegreen') for x in missing.sum(axis=0)]
missing.sum(axis=0).plot(kind='bar', figsize=(15,5), log=True, color=color)
plt.title(f'Number of missing values per variable in the full dataset')
Text(0.5, 1.0, 'Number of missing values per variable in the full dataset')
missing_col = pd.DataFrame({'missing values': missing.sum(axis=0).values},
index = missing.columns.values)
missing_col = np.array([(x,', '.join(y.index.values))
for x,y in missing_col.groupby('missing values')])
missing_col = pd.DataFrame({'Number of missing values':missing_col[:,0].astype(int),
'Frequence': missing_col[:,0].astype(int)/len(data)},
index=missing_col[:,1])
missing_col.sort_values('Number of missing values', ascending=False)\
.style.background_gradient(cmap='Blues', vmax=500, vmin=20, low=0.1, high=0.9)
| Number of missing values | Frequence | |
|---|---|---|
| X37 | 18984 | 0.437419 |
| X21 | 5854 | 0.134885 |
| X27 | 2764 | 0.063687 |
| X60 | 2152 | 0.049585 |
| X45 | 2147 | 0.049470 |
| X24 | 922 | 0.021244 |
| X28, X53, X54, X64 | 812 | 0.018710 |
| X41 | 754 | 0.017373 |
| X32 | 368 | 0.008479 |
| X52 | 301 | 0.006935 |
| X47 | 297 | 0.006843 |
| X46 | 135 | 0.003111 |
| X4, X12, X33, X40, X63 | 134 | 0.003088 |
| X19 | 128 | 0.002949 |
| X13, X20, X23, X30, X31, X39, X42, X43, X44, X49, X56, X62 | 127 | 0.002926 |
| X61 | 102 | 0.002350 |
| X16, X26 | 95 | 0.002189 |
| X8, X17, X34, X50 | 94 | 0.002166 |
| X5 | 89 | 0.002051 |
| X58 | 84 | 0.001935 |
| X11 | 44 | 0.001014 |
| X15 | 36 | 0.000829 |
| X9, X48 | 9 | 0.000207 |
| X1, X2, X3, X6, X7, X10, X14, X18, X22, X25, X29, X35, X36, X38, X51 | 8 | 0.000184 |
| X57, X59 | 7 | 0.000161 |
| X55 | 1 | 0.000023 |
| year, X65 | 0 | 0.000000 |
plt.figure(figsize=(10,5))
missing_row = missing.sum(axis=1).value_counts().sort_index()
col = ['darkred' if x > 10 else 'darkgreen' for x in missing_row.index]
plt.bar(missing_row.index.astype(str), missing_row.values, log=True, color=col)
plt.title('Count of missing values per row')
plt.xlabel('Number of missing values')
plt.ylabel('Number of company')
plt.legend(['To keep', 'To remove'])
plt.axvline(10, c='r', lw=4, ls=':')
<matplotlib.lines.Line2D at 0x19c97b3f370>
We know now which variables (upper to 1000) and companies (upper to 10) we have to remove because of their large number of missing values
useless_col = missing.sum(axis=0)[missing.sum(axis=0) >= 1000]
print('Variable to remove',
'\n'.join([str(var) + ': ' + description[var] for var in useless_col.index]),sep='\n')
Variable to remove X21: sales (n) / sales (n-1) X27: profit on operating activities / financial expenses X37: (current assets - inventories) / long-term liabilities X45: net profit / inventory X60: sales / inventory
missing.drop(columns=useless_col.index, inplace=True)
useless_row = missing.sum(axis=1)[missing.sum(axis=1) >= 10]
useless_row.name = 'Useless companies'
len(useless_row)
213
missing.drop(index=useless_row.index, inplace=True)
print("old dataset : ", (data == '?').sum().sum(), "omitted\nshape :", (data == '?').shape)
print("dataset cleaned : ", missing.sum().sum(), "omitted\nshape :", missing.shape)
old dataset : 41322 omitted shape : (43400, 66) dataset cleaned : 6019 omitted shape : (43187, 61)
Replace the last omitted values by the mean
for col in data.drop(columns=['year', 'X65']):
data.loc[data[col] == '?', col] = data.loc[data[col] != '?', col].astype(float).mean()
data[data.columns.drop(['year', 'X65'])] = data[data.columns.drop(['year', 'X65'])].astype(float)
data.X65 = data.X65.astype('category')
data.year = data.year.astype('category')
(data == '?').sum().sum()
0
To use the graph interaction, launch this notebook with jupyter
f = bkfigure(plot_height=350, plot_width=600)
scatter = f.scatter(data.X1.values, data.X65.values)
show(f, notebook_handle=True)
def change_scatter(var1, var2):
print('Variable on X axis :', description[var1],'\nVariable on Y axis :', description[var2])
scatter.data_source.data['x'] = data[var1].values
scatter.data_source.data['y'] = data[var2].values
push_notebook()
dd1 = ipywidgets.Dropdown(options=data.columns, value='X1', description='On x axis :')
dd2 = ipywidgets.Dropdown(options=data.columns, value='X65', description='On y axis :')
_ = ipywidgets.interact(change_scatter, var1=dd1, var2=dd2)
Boxplot of the 20 first variables depending on the bankruptcy variable
fig = make_subplots(3, 5, horizontal_spacing=0.02)
idx = np.array([(x,y) for x in range(1,4) for y in range(1,6)])
predictors = ['safe' if x == 0 else 'bankrupt' for x in data['X65'].values]
for i, x in enumerate(data.drop(['year','X65'], axis=1).columns[0:15]):
row, col = idx[i]
fig.append_trace(go.Box(y=data[x], x=predictors, name=x), row=row, col=col)
y_min, y_max = data[x].sort_values().iloc[30], data[x].sort_values().iloc[-30]
fig.update_yaxes(range=(y_min, y_max), row=row, col=col)
fig.update_xaxes(title=x, row=row, col=col)
fig.update_layout(height=800, width=2000)
1 1 2 1 3 1 4 1 5 1 1 2 2 2 3 2 4 2 5 2 1 3 2 3 3 3 4 3 5 3